The purpose of this document is to examine codon usage patterns at the genome-wide level across four Strongyloides species, Brugia malayi, Nippostrongylus brasiliensis, Pristionchus pacificus, and Caenorhabditis elegans. In addition, this documents examines codon usage patterns for functional subsets of genes in Strongyloides species and C. elegans.
For the functional subsets, eight groups are analyzed:
1) Top 2% of codon adapted genes in each genome
2) Bottom 2% of codon adapted genes in each genome
3) Top 2% of GC-rich genes in each genome
4) Top 2% of AT-rich genes in each genome
5) Top 2% of Strongyloides genes expressed in free-living adult females.
Full lists of CDS sequences for S. stercoralis, S. ratti, S. papillosus, S. venezuelensis, B. malayi, N. brasiliensis, P. pacificus, and C. elegans were download from Wormbase ParaSite (v15) between November 17, 2020 and March 31, 2021. The .fa files were used as input to the Wild Worm Codon Adapter App. For each species an excel report containing the computed codon adaptation index values relative to the available built-in codon usage rules were downloaded; these file are uploaded the code chunks below.
For each species, load data listing GC content and CAI indeces for the full list of CDS elements in genome. The full list contains transcript level information. If downstream tidy version of this data is avliable or the filtered versions are avaliable as files, don’t regenerate them.
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
Nb = '../Data/Nb_Codon_Usage_Report.xlsx',
Pp = '../Data/Pp_Codon_Usage_Report.xlsx',
Ce = '../Data/Ce_Codon_Usage_Report.xlsx',
Bm = '../Data/Bm_Codon_Usage_Report.xlsx')
dat.genomic <- sapply(temp, read_excel,
sheet = 1,
skip = 3,
col_names = T,
simplify = F,
USE.NAMES = T)
# This data contains transcript level information.
# Commented code would trim rows such that in cases where there are
# multiple transcripts per gene, only
# values from the longest transcript would be included.
# This was originally included to permit comparison to gene expression databases
# but is no longer required.
dat.genomic.cleaned <- lapply(seq_along(dat.genomic), function(x){
dat.genomic[[x]] %>%
dplyr::mutate(transcriptID = geneID) %>%
dplyr::mutate(Nb_CAI = if("Nb_CAI" %in% colnames(.)) Nb_CAI else NA) %>%
dplyr::mutate(Bm_CAI = if("Bm_CAI" %in% colnames(.)) Bm_CAI else NA) %>%
dplyr::mutate(Pp_CAI = if("Pp_CAI" %in% colnames(.)) Pp_CAI else NA) %>%
dplyr::mutate(species = names(temp)[[x]])
})
names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Nb","Pp", "Ce", "Bm")
}
Tidy the genomic CAI values for downstream plotting and analysis. This can be computationally intensive, so if the code is run, a tidy version will get saved at the end of the Markdown file as a .csv file. If that file exists, don’t re-run this code.
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
dat.genomic.df <- dat.genomic.cleaned %>%
map("geneID") %>%
unlist() %>%
as_tibble_col(column_name = "geneID") %>%
group_by(geneID)
dat.genomic.df <- dat.genomic.cleaned %>%
map("transcriptID") %>%
unlist() %>%
as_tibble_col(column_name = "transcriptID") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Sr_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Sr_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Bm_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Bm_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Nb_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Nb_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Pp_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Pp_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Ce_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Ce_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("GC") %>%
unlist() %>%
as_tibble_col(column_name = "GC") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("species") %>%
unlist() %>%
as_tibble_col(column_name = "species") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- group_by(dat.genomic.df, species)
}
# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
write.table(dat.genomic.df,
file = "../Data/tidy_genomic_CAI.csv",
sep = ",",
col.names = T,
row.names = FALSE)
}
This section takes the calculated codon adaptation index values for all predicted coding sequences, and generates a density curve for each species. Ideally, this plot could be used to benchmark the calculated CAI values against other metrics of codon bias in these species. For example, Mitreva et al 2006 reported effective number of codons (ENC) as a measure of gene-wise codon bias, across species. They report that “mean ENC across all sampled nemtaode species is 46.7 +/- 5.1” but that S. stercoralis and S. ratti are outliers with low ENC values (~35). Note: ENC values range from 20 for highly biased to 61 for no bias. Thus, benchmarking from Mitreva et al 2006, we might expect that Strongyloides species will have greater codon bias than C. elegans. Unfortunately, although they do report measuring distribution of ENC values across all transcripts for each species, they only have plots for S. ratti, P. trichosuri and P. pacificus.
dat.genomic.df <- suppressWarnings(
read.csv("../Data/tidy_genomic_CAI.csv",
header = TRUE)) %>%
as_tibble() %>%
dplyr::mutate(species = factor(species,
levels = c(
"Ss","Sr",
"Sp", "Sv",
"Nb","Bm",
"Pp","Ce"
)))%>%
group_by(species)
species.specific.dat.df <- dat.genomic.df %>%
dplyr::mutate(species_CAI = case_when(
species == 'Ce' ~ Ce_CAI,
species == 'Bm' ~ Bm_CAI,
species == 'Nb' ~ Nb_CAI,
species == 'Pp' ~ Pp_CAI,
TRUE ~ Sr_CAI
)) %>%
dplyr::mutate(outgroup_CAI = case_when(
species == 'Ce' ~ Sr_CAI,
species != 'Ce' ~ Ce_CAI
))
plot_distributions <- function(dat,
x,
title,
xlabel,
caption = "",
ylabel = "Density (scaled to maximum of 1)",
xlim = c(0,1),
ylim = c(0,1)) {
dat %>%
ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
geom_freqpoly(bins = 50) +
scale_color_carto_d(palette = "Bold",
name = "Species",
labels = c("S. stercoralis",
"S. ratti",
"S. papillosus",
"S. venezuelensis",
"N. brasiliensis",
"B. malayi",
"P. pacificus",
"C. elegans"
)) +
labs(title = title,
subtitle = "Data includes all predicted coding sequences",
x = xlabel,
y = ylabel,
caption = caption) +
coord_equal(xlim = xlim, ylim = ylim) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 10, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
species.specific.dat.df$species_CAI,
"Species-wide codon adaptiveness",
"Codon bias relative to \n genus coding usage rules (CAI)",
"Strongyloides species values relative to S. ratti usage rules
C. elegans values relative to C. elegans usage rules.
N. brasilensis values relative to N. brasilensis usage rules.
P. pacificus values relative to P. pacificus usage rules.
B. malayi values relative to B. malayi ussage rules.")
species_adaptiveness_plot
Notes re: the above plot - C. elegans shows overall lower codon bias compared to the Strongyloides species, as predicted from the Mitreva et al 2006 results. Reminder that Mitreva et al reported that this difference is greatly affected by the GC content; more AT rich speices like the Strongyloides species have greater codon usage biases. Also, it looks like the Strongyloides subclades (S. ratti - S. stercoralis; S. papillosus - S. venezuelensis) cluster together.
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)
# Post-hoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 125351.5615, df = 7, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Nb Pp Sp Sr
## ---------+------------------------------------------------------------------
## Ce | 200.5732
## | 0.0000*
## |
## Nb | -54.78451 -290.8693
## | 0.0000* 0.0000*
## |
## Pp | 2.222235 -232.9582 65.91399
## | 0.3677 0.0000* 0.0000*
## |
## Sp | 131.6552 -60.64074 203.3983 147.8540
## | 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Sr | 70.50996 -108.3044 127.9189 76.29029 -50.89541
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Ss | 82.75389 -97.34878 142.1991 90.15316 -39.82053 10.77989
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Sv | 128.6586 -59.41825 198.0473 143.6211 -0.361836 49.65166
## | 0.0000* 0.0000* 0.0000* 0.0000* 1.0000 0.0000*
## Col Mean-|
## Row Mean | Ss
## ---------+-----------
## Sv | 38.75182
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
temp <- species.specific.dat.df %>%
dplyr::mutate(group = case_when(
species == 'Ss' ~ 'Str',
species == 'Sr' ~ 'Str',
species == 'Sp' ~ 'Str',
species == 'Sv' ~ 'Str',
species == 'Nb' ~ 'Nb',
species == 'Bm' ~ 'Bm',
species == 'Pp' ~ 'Pp',
species == 'Ce' ~ 'Ce'
)) %>%
dplyr::mutate(group = as_factor(group))
one.way <- kruskal.test(species_CAI ~ group, data = temp)
post.hoc <- dunn.test::dunn.test(temp$species_CAI, temp$group, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 121238.4534, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Nb Pp
## ---------+--------------------------------------------
## Ce | 200.5732
## | 0.0000*
## |
## Nb | -54.78451 -290.8693
## | 0.0000* 0.0000*
## |
## Pp | 2.222235 -232.9582 65.91399
## | 0.1313 0.0000* 0.0000*
## |
## Str | 134.8566 -110.3365 231.1230 162.7482
## | 0.0000* 0.0000* 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
This section takes the calculated GC content values for all predicted coding sequences, and generates a density curve for each species.
species.specific.GC.df <- dat.genomic.df %>%
dplyr:: select(geneID, GC, species) %>%
group_by(species)
species_GC_plot <- plot_distributions(species.specific.GC.df,
species.specific.GC.df$GC,
"Species-wide GC content",
"GC content")
species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 119240.1549, df = 7, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Nb Pp Sp Sr
## ---------+------------------------------------------------------------------
## Ce | -46.80080
## | 0.0000*
## |
## Nb | -110.4037 -77.00632
## | 0.0000* 0.0000*
## |
## Pp | -110.8558 -76.88249 3.064646
## | 0.0000* 0.0000* 0.0305
## |
## Sp | 66.51547 126.6637 190.1689 193.4193
## | 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Sr | 92.29684 147.4780 203.8017 206.4012 33.31895
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Ss | 88.98603 144.8369 202.0447 204.7239 29.07753 -4.308830
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000* 0.0002*
## |
## Sv | 70.30050 129.1726 191.1602 194.2058 5.376535 -27.86798
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000* 0.0000*
## Col Mean-|
## Row Mean | Ss
## ---------+-----------
## Sv | -23.62159
## | 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
temp <- species.specific.GC.df %>%
dplyr::mutate(group = case_when(
species == 'Ss' ~ 'Str',
species == 'Sr' ~ 'Str',
species == 'Sp' ~ 'Str',
species == 'Sv' ~ 'Str',
species == 'Nb' ~ 'Nb',
species == 'Bm' ~ 'Bm',
species == 'Pp' ~ 'Pp',
species == 'Ce' ~ 'Ce'
)) %>%
dplyr::mutate(group = as_factor(group))
one.way <- kruskal.test(GC ~ group, data = temp)
post.hoc <- dunn.test::dunn.test(temp$GC, temp$group, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 117564.9849, df = 4, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Nb Pp
## ---------+--------------------------------------------
## Ce | -46.80080
## | 0.0000*
## |
## Nb | -110.4037 -77.00632
## | 0.0000* 0.0000*
## |
## Pp | -110.8558 -76.88249 3.064646
## | 0.0000* 0.0000* 0.0109*
## |
## Str | 98.76053 189.7851 263.9731 274.5119
## | 0.0000* 0.0000* 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
Take genome-wide CAI values, and for each species filter the top 2% and also bottom 2% of codon adapted genes. Will run GO analysis on these to determine whether there are enriched GO Terms.
percentile = 0.02
# For following analyses, only look at Strongyloides and C. elegans,
# not B. malayi or N. brasiliensis
species.specific.dat.df<-species.specific.dat.df %>%
dplyr::filter(species != "Bm") %>%
dplyr::filter(species != "Nb")%>%
dplyr::filter(species != "Pp")
dat.Top <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Top)
dat.Bot <- species.specific.dat.df %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Bot)
This section/plot asks questions like: Are the most-codon-adapted Strongyloides genes less codon adapted relative to the C. elegans usage rules? Similarly, are the least-codon-adapted genes less codon adapted relative to the other usage rule. Also, is there a consistent relationship between the degree of codon adaptation and GC content (a common confounding factor).
tbl <- rbind(dat.Top, dat.Bot) %>%
dplyr::mutate(Description = dplyr::recode(Description, Top = "Most-adapted genes",
Bot = "Least-adapted genes"))
ggplot(tbl, aes(species_CAI,GC, Description)) +
geom_point(data = species.specific.dat.df,
mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_fixed (3) +
facet_grid( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (genus rules)",
subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_equal()+
facet_wrap( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to non-genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8))
Do genes with the top and bottom 2% of CAI values show particularly high or low expression levels? We might expect that genes with the top 2% of CAI values, where the CAI is based on highly expressed S. ratti ESTs might tend to show higher expression levels.
temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv,
header = TRUE)
species_names <- names(dat.expression)
dat.Top.expression <- lapply(species_names, function(x) {
dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Top.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")
dat.Bot.expression <- lapply(species_names, function(x) {
dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Bot.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")
dat.expression <- lapply(species_names, function(x) {
dat.expression[[x]] %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")
lapply(species_names, function(x){
plotdata.subset1 <- filter(dat.Top.expression, species == x)
plotdata.subset2 <- filter(dat.Bot.expression, species == x)
plotdata.all <- filter(dat.expression, species == x)
plot.df <- bind_rows(HighCAI = plotdata.subset1,
LowCAI = plotdata.subset2,
AllGenes = plotdata.all, .id = "Description")
p.Bot.expression <- ggplot(plot.df) +
aes(x=stage, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
alpha = .5,
position = position_dodge(width = 1),
trim = FALSE) +
scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
show.legend = FALSE) +
facet_grid(~stage, scales = "free") +
labs(y="log2CPM Expression", x = "Life Stage",
title = paste0(x, " - log2 Counts per Million (CPM)"),
caption=paste0("produced on ", Sys.time())) +
theme_bw()
# suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
# plot = p.Bot.expression,
# device = "pdf",
# height = 4,
# width = 7,
# path = output.path,
# useDingbats=FALSE))
p.Bot.expression
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
Does CAI value correspond with expression in free-living females
temp <- lapply(species_names, function(x) {
dat.select <- dplyr::filter(species.specific.dat.df,
species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
dplyr::ungroup() %>%
dplyr::select(geneID, species_CAI)
subset.expression <- dat.expression %>%
dplyr::filter(species == x)%>%
dplyr::filter(stage == "FLF") %>%
dplyr::filter(geneID %in% dat.select$geneID)
full_join(dat.select, subset.expression, by = "geneID") %>%
dplyr::select(!stage)
})
names(temp) <- species_names
caiExp.df <- map_dfr(temp, bind_rows, .id = "species")
caiExp.df <- bind_rows(AllGenes = caiExp.df,
HighCAI = dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
LowCAI = dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
.id = "Description") %>%
dplyr::mutate(species = as_factor(species))
caiExp.highExp_plot <- ggplot(caiExp.df) +
aes(x=Description, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
position = position_dodge(width = 1),
trim = FALSE,
color = 'black') +
#scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
color = "black",
show.legend = FALSE) +
facet_grid( ~species) +
labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
title = "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
caiExp.highExp_plot
suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
dplyr::mutate(condition = as_factor(condition))%>%
group_by(species, geneID, condition, stage)
options(contrasts = c("contr.sum", "contr.poly"))
lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)
res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
dplyr::filter(LS1 == LS2) %>%
unite(temp1, comp1, LS1, sep = ":")%>%
unite(temp2, comp2, LS2, sep = ":")%>%
unite(comparison, temp1, temp2, sep = "-")
})
## [[1]]
## # A tibble: 21 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.79 0.960 2.63 3.50e-12
## 2 LowCPM:FLF-AllGenes:FLF -1.45 -2.41 -0.497 1.20e- 5
## 3 LowCPM:FLF-HighCPM:FLF -3.25 -4.50 -1.99 2.38e-13
## 4 HighCPM:iL3-AllGenes:iL3 0.665 -0.168 1.50 3.43e- 1
## 5 LowCPM:iL3-AllGenes:iL3 -2.05 -3.00 -1.09 4.68e-12
## 6 LowCPM:iL3-HighCPM:iL3 -2.71 -3.97 -1.45 3.04e-12
## 7 HighCPM:iL3a-AllGenes:iL3a 1.10 0.266 1.93 4.83e- 4
## 8 LowCPM:iL3a-AllGenes:iL3a -1.76 -2.71 -0.801 1.14e- 8
## 9 LowCPM:iL3a-HighCPM:iL3a -2.86 -4.11 -1.60 3.64e-13
## 10 HighCPM:PF-AllGenes:PF 0.943 0.109 1.78 9.21e- 3
## # … with 11 more rows
##
## [[2]]
## # A tibble: 12 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.57 0.785 2.35 3.69e- 9
## 2 LowCPM:FLF-AllGenes:FLF -4.00 -4.93 -3.08 0.
## 3 LowCPM:FLF-HighCPM:FLF -5.57 -6.77 -4.37 0.
## 4 HighCPM:FLM-AllGenes:FLM 3.22 2.44 4.00 0.
## 5 LowCPM:FLM-AllGenes:FLM -2.72 -3.65 -1.80 8.69e-14
## 6 LowCPM:FLM-HighCPM:FLM -5.94 -7.14 -4.74 0.
## 7 HighCPM:iL3-AllGenes:iL3 0.607 -0.174 1.39 3.15e- 1
## 8 LowCPM:iL3-AllGenes:iL3 -2.77 -3.70 -1.85 1.23e-13
## 9 LowCPM:iL3-HighCPM:iL3 -3.38 -4.58 -2.18 1.36e-13
## 10 HighCPM:PF-AllGenes:PF 0.175 -0.605 0.956 1.00e+ 0
## 11 LowCPM:PF-AllGenes:PF -3.93 -4.85 -3.01 0.
## 12 LowCPM:PF-HighCPM:PF -4.11 -5.31 -2.91 2.93e-14
##
## [[3]]
## # A tibble: 18 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 2.60 1.86 3.33 0.
## 2 LowCPM:FLF-AllGenes:FLF -0.806 -1.66 0.0522 9.58e- 2
## 3 LowCPM:FLF-HighCPM:FLF -3.40 -4.52 -2.28 1.69e-13
## 4 HighCPM:flL1L2-AllGenes:flL1L2 3.67 2.94 4.41 0.
## 5 LowCPM:flL1L2-AllGenes:flL1L2 -1.76 -2.62 -0.901 1.31e-10
## 6 LowCPM:flL1L2-HighCPM:flL1L2 -5.43 -6.55 -4.31 0.
## 7 HighCPM:FLM-AllGenes:FLM 2.60 1.86 3.33 0.
## 8 LowCPM:FLM-AllGenes:FLM -1.50 -2.36 -0.642 1.60e- 7
## 9 LowCPM:FLM-HighCPM:FLM -4.10 -5.22 -2.98 0.
## 10 HighCPM:iL3-AllGenes:iL3 1.04 0.307 1.78 1.11e- 4
## 11 LowCPM:iL3-AllGenes:iL3 -1.53 -2.39 -0.669 8.02e- 8
## 12 LowCPM:iL3-HighCPM:iL3 -2.57 -3.69 -1.45 4.28e-13
## 13 HighCPM:PF-AllGenes:PF 1.92 1.18 2.65 2.29e-13
## 14 LowCPM:PF-AllGenes:PF -0.938 -1.80 -0.0804 1.61e- 2
## 15 LowCPM:PF-HighCPM:PF -2.85 -3.97 -1.73 1.29e-13
## 16 HighCPM:pL1L2-AllGenes:pL1L2 3.65 2.91 4.38 0.
## 17 LowCPM:pL1L2-AllGenes:pL1L2 -1.75 -2.61 -0.897 1.48e-10
## 18 LowCPM:pL1L2-HighCPM:pL1L2 -5.40 -6.52 -4.28 0.
##
## [[4]]
## # A tibble: 6 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.12 0.544 1.70 5.00e- 7
## 2 LowCPM:FLF-AllGenes:FLF -1.55 -2.32 -0.789 1.05e- 7
## 3 LowCPM:FLF-HighCPM:FLF -2.68 -3.63 -1.72 9.73e-14
## 4 HighCPM:PF-AllGenes:PF 0.493 -0.0869 1.07 1.48e- 1
## 5 LowCPM:PF-AllGenes:PF -1.61 -2.37 -0.846 2.91e- 8
## 6 LowCPM:PF-HighCPM:PF -2.10 -3.06 -1.15 4.66e- 9
Take lists of the top percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.
# Carry out GO enrichment using gProfiler2 ----
run_GO <- function (data){
GO.geneID <- data %>%
ungroup() %>%
group_by(species) %>%
dplyr::group_map(~ {
gost.res <- .x %>%
dplyr::select(geneID, GC) %>%
unique()
}, .keep = TRUE)
names(GO.geneID) <- levels(data$species)
gost.results <- lapply(names(GO.geneID), function(y){
organism = case_when (y == "Ss" ~ "ststerprjeb528",
y == "Sr" ~ "strattprjeb125",
y == "Sp" ~ "stpapiprjeb525",
y == "Sv" ~ "stveneprjeb530",
y == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID[[y]]$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results) <- names(GO.geneID)
gost.results
}
find_enrichments <- function (gost.results) {
# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(gost.results), function(y){
gost.results[[y]]$result %>%
as_tibble() %>%
dplyr::select(term_id, p_value)
})
names(enriched.terms) <- names(gost.results)
enriched.terms
}
find_common_enrichments <- function (data, p.val,n_groups){
data %>%
dplyr::filter(p_value <= p.val) %>%
group_by(term_id) %>%
summarize(n= n()) %>%
dplyr::filter(n >= n_groups)
}
gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.top$Ss, interactive = T, capped = T)
gostplot(gost.results.top$Sr, interactive = T, capped = T)
gostplot(gost.results.top$Sp, interactive = T, capped = T)
gostplot(gost.results.top$Sv, interactive = T, capped = T)
gostplot(gost.results.top$Ce, interactive = T, capped = T)
GO terms that are commonly enriched in the top percentile of codon adapted genes in all Strongyloides species tested:
dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many of the common Strongyloides go terms are found in the C. elegans terms?
dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name)
How many are unique to the Strongyloides species?
temp <- dplyr::filter(highTerms.overlap.top,
!term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Take lists of the bottom percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.
# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.bot$Ss, interactive = T, capped = T)
gostplot(gost.results.bot$Sr, interactive = T, capped = T)
gostplot(gost.results.bot$Sp, interactive = T, capped = T)
gostplot(gost.results.bot$Sv, interactive = T, capped = T)
gostplot(gost.results.bot$Ce, interactive = T, capped = T)
GO terms that are commonly enriched in the bottom percentile of codon adapted genes in all Strongyloides species tested:
dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
How many of the common Strongyloides go terms are found in the C. elegans terms?
dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
How many are unique to the Strongyloides species?
temp <- dplyr::filter(highTerms.overlap.bot,
!term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Generate lists of genes with the highest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.
gost.results.highexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = expression, prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.highexp) <- species_names
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
What GO terms are commonly enriched in highly expressed genes in 3 of 4 species?
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both highly expressed sequences and highest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both highly expressed sequences and poorest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Generate lists of genes with lowest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.
gost.results.lowexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = desc(expression), prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.lowexp) <- species_names
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
What GO terms are commonly enriched in poorly expressed genes in 3 of 4 species?
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both poorly expressed sequences and highest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both poorly expressed sequences and poorest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Saving data and plots generated by the above code.
# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Fig_S3A.pdf',
plot = species_adaptiveness_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save distribution plot of GC content by species
ggsave('../Outputs/Fig_S3B.pdf',
plot = species_GC_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save plot of CAI vs FLF expression level
ggsave('../Outputs/Fig_S3C.pdf',
plot = caiExp.highExp_plot,
width = 5.5, height = 3,
units = "in", device = "pdf",
useDingbats = FALSE)
write_excel <- function(data, sheet_facet, filename){
file <- paste(filename,".xlsx",sep = "")
# Workbook
to_download <<- createWorkbook()
sapply(seq_along(sheet_facet), function(x){
addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
sheet_data <- data %>%
dplyr::filter(species == sheet_facet[[x]]) %>%
dplyr::select(!species)
## Results of codon usage analysis
writeData(
to_download,
sheet = x,
x = sheet_data,
startRow = 1,
startCol = 1,
headerStyle = createStyle(
textDecoration = "Bold",
halign = "center",
border = "bottom"
)
)
})
saveWorkbook(to_download, file, overwrite=TRUE)
}
# Save files with top and bottom percentiles of codon adapted genes per species,
# with the GO terms associated with those genes from bioMaRt
Annt1<- getBM(attributes=c('wbps_transcript_id',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010','wbps_transcript_id'),
value = list(c('ststerprjeb528',
'stpapiprjeb525',
'stveneprjeb530'
),
tbl$geneID)) %>%
as_tibble() %>%
group_by(wbps_transcript_id) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = wbps_transcript_id)
Annt2<- getBM(attributes=c('external_gene_id',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010',
'gene_name'),
value = list('strattprjeb125',
tbl$geneID)) %>%
as_tibble() %>%
group_by(external_gene_id) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = external_gene_id)
Annt3<- getBM(attributes=c('wormbase_gseq',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010',
'wormbase_gseqname'),
value = list('caelegprjna13758',
tbl$geneID)) %>%
as_tibble() %>%
group_by(wormbase_gseq) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = wormbase_gseq)
rbind(Annt1, Annt2, Annt3)%>%
left_join(tbl, ., by = "geneID")%>%
dplyr::select(geneID, species_CAI, GC, Description, species, "Term ID") %>%
dplyr::rename("Gene ID" = geneID,
"CAI by genus-specific usage rule" = species_CAI,
"GO term ID" = "Term ID") %>%
write_excel(levels(tbl$species), '../Outputs/File_S3')
# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
lowCAI = lapply(names(gost.results.bot), function(y){
gost.results.bot[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Least-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
gost.results.top[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Most-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/File_S4')
enriched.GO.terms.expression <- rbind(
lowFLF = lapply(names(gost.results.lowexp), function(y){
gost.results.lowexp[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) %>%
dplyr::mutate(Description = "Least-expressed genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highFLF = lapply(names(gost.results.highexp), function(y){
gost.results.highexp[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) %>%
dplyr::mutate(Description = "Most-expressed genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
# Save list of GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and genes that are highly expressed/poorly expressed in FLF
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.bot$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Least-adapted genes"),
highCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.top$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Most-adapted genes"),
Highexp = dplyr::filter(enriched.GO.terms.expression, term_id %in% highTerms.overlap.highexp$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Highly-expressed genes"),
PoorExp = dplyr::filter(enriched.GO.terms.expression, term_id %in% highTerms.overlap.lowexp$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Poorly-expressed genes")
) %>%
dplyr::mutate(Description = factor(Description)) %>%
dplyr::rename(species = Description, "Term ID" = term_id, "Term Name" = term_name, Source = source, "Contributing Genes" = Genes)
write_excel(common.terms, levels(common.terms$species), '../Outputs/File_S5')
# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
gost.res <- gost.results.bot[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.bot$term_id)
ggsave(paste0('../Outputs/Fig_4C_',y,'.pdf'), plot = gost.ggplot.pub,
width = 7, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
})
# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
gost.res <- gost.results.top[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.top$term_id)
ggsave(paste0('../Outputs/Fig_4A_',y,'.pdf'), plot = gost.ggplot.pub,
width = 7, height = 5,
units = "in", device = "pdf",
useDingbats = FALSE)
})
suppressPackageStartupMessages({
library(knitr)
library(rmarkdown)
library(tidyverse)
library(readxl)
library(magrittr)
library(DescTools)
library(ggthemes)
library(biomaRt)
library(ggplot2)
library(openxlsx)
library(wrapr)
library(Matching)
library(cowplot)
library(gprofiler2)
library(rcartocolor)
source('fetch_cDNAseq.R')
})
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Check for presence of output plots folder, generate if it doesn't exist
output.path <- "../Outputs"
if (!dir.exists(output.path)){
dir.create(output.path)
}
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
Nb = '../Data/Nb_Codon_Usage_Report.xlsx',
Pp = '../Data/Pp_Codon_Usage_Report.xlsx',
Ce = '../Data/Ce_Codon_Usage_Report.xlsx',
Bm = '../Data/Bm_Codon_Usage_Report.xlsx')
dat.genomic <- sapply(temp, read_excel,
sheet = 1,
skip = 3,
col_names = T,
simplify = F,
USE.NAMES = T)
# This data contains transcript level information.
# Commented code would trim rows such that in cases where there are
# multiple transcripts per gene, only
# values from the longest transcript would be included.
# This was originally included to permit comparison to gene expression databases
# but is no longer required.
dat.genomic.cleaned <- lapply(seq_along(dat.genomic), function(x){
dat.genomic[[x]] %>%
dplyr::mutate(transcriptID = geneID) %>%
dplyr::mutate(Nb_CAI = if("Nb_CAI" %in% colnames(.)) Nb_CAI else NA) %>%
dplyr::mutate(Bm_CAI = if("Bm_CAI" %in% colnames(.)) Bm_CAI else NA) %>%
dplyr::mutate(Pp_CAI = if("Pp_CAI" %in% colnames(.)) Pp_CAI else NA) %>%
dplyr::mutate(species = names(temp)[[x]])
})
names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Nb","Pp", "Ce", "Bm")
}
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
dat.genomic.df <- dat.genomic.cleaned %>%
map("geneID") %>%
unlist() %>%
as_tibble_col(column_name = "geneID") %>%
group_by(geneID)
dat.genomic.df <- dat.genomic.cleaned %>%
map("transcriptID") %>%
unlist() %>%
as_tibble_col(column_name = "transcriptID") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Sr_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Sr_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Bm_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Bm_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Nb_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Nb_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Pp_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Pp_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Ce_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Ce_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("GC") %>%
unlist() %>%
as_tibble_col(column_name = "GC") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("species") %>%
unlist() %>%
as_tibble_col(column_name = "species") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- group_by(dat.genomic.df, species)
}
# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
write.table(dat.genomic.df,
file = "../Data/tidy_genomic_CAI.csv",
sep = ",",
col.names = T,
row.names = FALSE)
}
dat.genomic.df <- suppressWarnings(
read.csv("../Data/tidy_genomic_CAI.csv",
header = TRUE)) %>%
as_tibble() %>%
dplyr::mutate(species = factor(species,
levels = c(
"Ss","Sr",
"Sp", "Sv",
"Nb","Bm",
"Pp","Ce"
)))%>%
group_by(species)
species.specific.dat.df <- dat.genomic.df %>%
dplyr::mutate(species_CAI = case_when(
species == 'Ce' ~ Ce_CAI,
species == 'Bm' ~ Bm_CAI,
species == 'Nb' ~ Nb_CAI,
species == 'Pp' ~ Pp_CAI,
TRUE ~ Sr_CAI
)) %>%
dplyr::mutate(outgroup_CAI = case_when(
species == 'Ce' ~ Sr_CAI,
species != 'Ce' ~ Ce_CAI
))
plot_distributions <- function(dat,
x,
title,
xlabel,
caption = "",
ylabel = "Density (scaled to maximum of 1)",
xlim = c(0,1),
ylim = c(0,1)) {
dat %>%
ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
geom_freqpoly(bins = 50) +
scale_color_carto_d(palette = "Bold",
name = "Species",
labels = c("S. stercoralis",
"S. ratti",
"S. papillosus",
"S. venezuelensis",
"N. brasiliensis",
"B. malayi",
"P. pacificus",
"C. elegans"
)) +
labs(title = title,
subtitle = "Data includes all predicted coding sequences",
x = xlabel,
y = ylabel,
caption = caption) +
coord_equal(xlim = xlim, ylim = ylim) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 10, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
species.specific.dat.df$species_CAI,
"Species-wide codon adaptiveness",
"Codon bias relative to \n genus coding usage rules (CAI)",
"Strongyloides species values relative to S. ratti usage rules
C. elegans values relative to C. elegans usage rules.
N. brasilensis values relative to N. brasilensis usage rules.
P. pacificus values relative to P. pacificus usage rules.
B. malayi values relative to B. malayi ussage rules.")
species_adaptiveness_plot
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)
# Post-hoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
temp <- species.specific.dat.df %>%
dplyr::mutate(group = case_when(
species == 'Ss' ~ 'Str',
species == 'Sr' ~ 'Str',
species == 'Sp' ~ 'Str',
species == 'Sv' ~ 'Str',
species == 'Nb' ~ 'Nb',
species == 'Bm' ~ 'Bm',
species == 'Pp' ~ 'Pp',
species == 'Ce' ~ 'Ce'
)) %>%
dplyr::mutate(group = as_factor(group))
one.way <- kruskal.test(species_CAI ~ group, data = temp)
post.hoc <- dunn.test::dunn.test(temp$species_CAI, temp$group, method = "bonferroni")
species.specific.GC.df <- dat.genomic.df %>%
dplyr:: select(geneID, GC, species) %>%
group_by(species)
species_GC_plot <- plot_distributions(species.specific.GC.df,
species.specific.GC.df$GC,
"Species-wide GC content",
"GC content")
species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
temp <- species.specific.GC.df %>%
dplyr::mutate(group = case_when(
species == 'Ss' ~ 'Str',
species == 'Sr' ~ 'Str',
species == 'Sp' ~ 'Str',
species == 'Sv' ~ 'Str',
species == 'Nb' ~ 'Nb',
species == 'Bm' ~ 'Bm',
species == 'Pp' ~ 'Pp',
species == 'Ce' ~ 'Ce'
)) %>%
dplyr::mutate(group = as_factor(group))
one.way <- kruskal.test(GC ~ group, data = temp)
post.hoc <- dunn.test::dunn.test(temp$GC, temp$group, method = "bonferroni")
percentile = 0.02
# For following analyses, only look at Strongyloides and C. elegans,
# not B. malayi or N. brasiliensis
species.specific.dat.df<-species.specific.dat.df %>%
dplyr::filter(species != "Bm") %>%
dplyr::filter(species != "Nb")%>%
dplyr::filter(species != "Pp")
dat.Top <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Top)
dat.Bot <- species.specific.dat.df %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Bot)
tbl <- rbind(dat.Top, dat.Bot) %>%
dplyr::mutate(Description = dplyr::recode(Description, Top = "Most-adapted genes",
Bot = "Least-adapted genes"))
ggplot(tbl, aes(species_CAI,GC, Description)) +
geom_point(data = species.specific.dat.df,
mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_fixed (3) +
facet_grid( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (genus rules)",
subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_equal()+
facet_wrap( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to non-genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8))
temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv,
header = TRUE)
species_names <- names(dat.expression)
dat.Top.expression <- lapply(species_names, function(x) {
dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Top.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")
dat.Bot.expression <- lapply(species_names, function(x) {
dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Bot.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")
dat.expression <- lapply(species_names, function(x) {
dat.expression[[x]] %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")
lapply(species_names, function(x){
plotdata.subset1 <- filter(dat.Top.expression, species == x)
plotdata.subset2 <- filter(dat.Bot.expression, species == x)
plotdata.all <- filter(dat.expression, species == x)
plot.df <- bind_rows(HighCAI = plotdata.subset1,
LowCAI = plotdata.subset2,
AllGenes = plotdata.all, .id = "Description")
p.Bot.expression <- ggplot(plot.df) +
aes(x=stage, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
alpha = .5,
position = position_dodge(width = 1),
trim = FALSE) +
scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
show.legend = FALSE) +
facet_grid(~stage, scales = "free") +
labs(y="log2CPM Expression", x = "Life Stage",
title = paste0(x, " - log2 Counts per Million (CPM)"),
caption=paste0("produced on ", Sys.time())) +
theme_bw()
# suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
# plot = p.Bot.expression,
# device = "pdf",
# height = 4,
# width = 7,
# path = output.path,
# useDingbats=FALSE))
p.Bot.expression
})
temp <- lapply(species_names, function(x) {
dat.select <- dplyr::filter(species.specific.dat.df,
species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
dplyr::ungroup() %>%
dplyr::select(geneID, species_CAI)
subset.expression <- dat.expression %>%
dplyr::filter(species == x)%>%
dplyr::filter(stage == "FLF") %>%
dplyr::filter(geneID %in% dat.select$geneID)
full_join(dat.select, subset.expression, by = "geneID") %>%
dplyr::select(!stage)
})
names(temp) <- species_names
caiExp.df <- map_dfr(temp, bind_rows, .id = "species")
caiExp.df <- bind_rows(AllGenes = caiExp.df,
HighCAI = dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
LowCAI = dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
.id = "Description") %>%
dplyr::mutate(species = as_factor(species))
caiExp.highExp_plot <- ggplot(caiExp.df) +
aes(x=Description, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
position = position_dodge(width = 1),
trim = FALSE,
color = 'black') +
#scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
color = "black",
show.legend = FALSE) +
facet_grid( ~species) +
labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
title = "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
caiExp.highExp_plot
suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
dplyr::mutate(condition = as_factor(condition))%>%
group_by(species, geneID, condition, stage)
options(contrasts = c("contr.sum", "contr.poly"))
lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)
res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
dplyr::filter(LS1 == LS2) %>%
unite(temp1, comp1, LS1, sep = ":")%>%
unite(temp2, comp2, LS2, sep = ":")%>%
unite(comparison, temp1, temp2, sep = "-")
})
# Carry out GO enrichment using gProfiler2 ----
run_GO <- function (data){
GO.geneID <- data %>%
ungroup() %>%
group_by(species) %>%
dplyr::group_map(~ {
gost.res <- .x %>%
dplyr::select(geneID, GC) %>%
unique()
}, .keep = TRUE)
names(GO.geneID) <- levels(data$species)
gost.results <- lapply(names(GO.geneID), function(y){
organism = case_when (y == "Ss" ~ "ststerprjeb528",
y == "Sr" ~ "strattprjeb125",
y == "Sp" ~ "stpapiprjeb525",
y == "Sv" ~ "stveneprjeb530",
y == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID[[y]]$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results) <- names(GO.geneID)
gost.results
}
find_enrichments <- function (gost.results) {
# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(gost.results), function(y){
gost.results[[y]]$result %>%
as_tibble() %>%
dplyr::select(term_id, p_value)
})
names(enriched.terms) <- names(gost.results)
enriched.terms
}
find_common_enrichments <- function (data, p.val,n_groups){
data %>%
dplyr::filter(p_value <= p.val) %>%
group_by(term_id) %>%
summarize(n= n()) %>%
dplyr::filter(n >= n_groups)
}
gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.top$Ss, interactive = T, capped = T)
gostplot(gost.results.top$Sr, interactive = T, capped = T)
gostplot(gost.results.top$Sp, interactive = T, capped = T)
gostplot(gost.results.top$Sv, interactive = T, capped = T)
gostplot(gost.results.top$Ce, interactive = T, capped = T)
dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name, intersection)
dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.top,
!term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.bot$Ss, interactive = T, capped = T)
gostplot(gost.results.bot$Sr, interactive = T, capped = T)
gostplot(gost.results.bot$Sp, interactive = T, capped = T)
gostplot(gost.results.bot$Sv, interactive = T, capped = T)
gostplot(gost.results.bot$Ce, interactive = T, capped = T)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.bot,
!term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
gost.results.highexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = expression, prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.highexp) <- species_names
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
gost.results.lowexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = desc(expression), prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.lowexp) <- species_names
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Fig_S3A.pdf',
plot = species_adaptiveness_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save distribution plot of GC content by species
ggsave('../Outputs/Fig_S3B.pdf',
plot = species_GC_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save plot of CAI vs FLF expression level
ggsave('../Outputs/Fig_S3C.pdf',
plot = caiExp.highExp_plot,
width = 5.5, height = 3,
units = "in", device = "pdf",
useDingbats = FALSE)
write_excel <- function(data, sheet_facet, filename){
file <- paste(filename,".xlsx",sep = "")
# Workbook
to_download <<- createWorkbook()
sapply(seq_along(sheet_facet), function(x){
addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
sheet_data <- data %>%
dplyr::filter(species == sheet_facet[[x]]) %>%
dplyr::select(!species)
## Results of codon usage analysis
writeData(
to_download,
sheet = x,
x = sheet_data,
startRow = 1,
startCol = 1,
headerStyle = createStyle(
textDecoration = "Bold",
halign = "center",
border = "bottom"
)
)
})
saveWorkbook(to_download, file, overwrite=TRUE)
}
# Save files with top and bottom percentiles of codon adapted genes per species,
# with the GO terms associated with those genes from bioMaRt
Annt1<- getBM(attributes=c('wbps_transcript_id',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010','wbps_transcript_id'),
value = list(c('ststerprjeb528',
'stpapiprjeb525',
'stveneprjeb530'
),
tbl$geneID)) %>%
as_tibble() %>%
group_by(wbps_transcript_id) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = wbps_transcript_id)
Annt2<- getBM(attributes=c('external_gene_id',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010',
'gene_name'),
value = list('strattprjeb125',
tbl$geneID)) %>%
as_tibble() %>%
group_by(external_gene_id) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = external_gene_id)
Annt3<- getBM(attributes=c('wormbase_gseq',
'go_accession'),
# grab the ensembl annotations for Wormbase Parasite genes
mart = useMart(biomart="parasite_mart",
dataset = "wbps_gene",
host="https://parasite.wormbase.org",
port = 443),
filters = c('species_id_1010',
'wormbase_gseqname'),
value = list('caelegprjna13758',
tbl$geneID)) %>%
as_tibble() %>%
group_by(wormbase_gseq) %>%
dplyr::summarise("Term ID" = paste(go_accession, collapse = ", ")) %>%
dplyr::rename(geneID = wormbase_gseq)
rbind(Annt1, Annt2, Annt3)%>%
left_join(tbl, ., by = "geneID")%>%
dplyr::select(geneID, species_CAI, GC, Description, species, "Term ID") %>%
dplyr::rename("Gene ID" = geneID,
"CAI by genus-specific usage rule" = species_CAI,
"GO term ID" = "Term ID") %>%
write_excel(levels(tbl$species), '../Outputs/File_S3')
# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
lowCAI = lapply(names(gost.results.bot), function(y){
gost.results.bot[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Least-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
gost.results.top[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Most-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/File_S4')
enriched.GO.terms.expression <- rbind(
lowFLF = lapply(names(gost.results.lowexp), function(y){
gost.results.lowexp[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) %>%
dplyr::mutate(Description = "Least-expressed genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highFLF = lapply(names(gost.results.highexp), function(y){
gost.results.highexp[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) %>%
dplyr::mutate(Description = "Most-expressed genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
# Save list of GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and genes that are highly expressed/poorly expressed in FLF
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.bot$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Least-adapted genes"),
highCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.top$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Most-adapted genes"),
Highexp = dplyr::filter(enriched.GO.terms.expression, term_id %in% highTerms.overlap.highexp$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Highly-expressed genes"),
PoorExp = dplyr::filter(enriched.GO.terms.expression, term_id %in% highTerms.overlap.lowexp$term_id) %>%
group_by(source, term_id, term_name) %>%
dplyr::select(term_id, source,term_name, intersection) %>%
dplyr::summarise(Genes = paste(intersection, collapse = ", ")) %>%
dplyr::mutate(Description = "Poorly-expressed genes")
) %>%
dplyr::mutate(Description = factor(Description)) %>%
dplyr::rename(species = Description, "Term ID" = term_id, "Term Name" = term_name, Source = source, "Contributing Genes" = Genes)
write_excel(common.terms, levels(common.terms$species), '../Outputs/File_S5')
# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
gost.res <- gost.results.bot[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.bot$term_id)
ggsave(paste0('../Outputs/Fig_4C_',y,'.pdf'), plot = gost.ggplot.pub,
width = 7, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
})
# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
gost.res <- gost.results.top[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.top$term_id)
ggsave(paste0('../Outputs/Fig_4A_',y,'.pdf'), plot = gost.ggplot.pub,
width = 7, height = 5,
units = "in", device = "pdf",
useDingbats = FALSE)
})